Data Link: https://www.dropbox.com/sh/7fo4efxhpenexqp/AACmuri_l-LDiVDUDJ3hVLqPa?dl=0
This tutorial is based on one from ClickSecurity's data_hacking repo: https://github.com/carriegardner428/data_hacking/blob/master/contagio_traffic_analysis/contagio_traffic_analysis.ipynb
This dataset contains samples of malicious network traffic from the Contagio malware dump. Samples are classified as "APT", "CRIME", or "METASPLOIT", and are further divided into per-sample groupings.
In [30]:
import os
import pandas as pd
from datetime import datetime
In [58]:
# Mapping of fields of the files we want to read in and initial setup of pandas dataframes
logs_to_process = {
'conn.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','proto','service','duration','orig_bytes','resp_bytes','conn_state','local_orig','missed_bytes','history','orig_pkts','orig_ip_bytes','resp_pkts','resp_ip_bytes','tunnel_parents','threat','sample'],
'dns.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','proto','trans_id','query','qclass','qclass_name','qtype','qtype_name','rcode','rcode_name','AA','TC','RD','RA','Z','answers','TTLs','rejected','threat','sample'],
'files.log' : ['ts','fuid','tx_hosts','rx_hosts','conn_uids','source','depth','analyzers','mime_type','filename','duration','local_orig','is_orig','seen_bytes','total_bytes','missing_bytes','overflow_bytes','timedout','parent_fuid','md5','sha1','sha256','extracted','threat','sample'],
'ftp.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','user','password','command','arg','mime_type','file_size','reply_code','reply_msg','data_channel.passive','data_channel.orig_h','data_channel.resp_h','data_channel.resp_p','fuid','threat','sample'],
'http.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','trans_depth','method','host','uri','referrer','user_agent','request_body_len','response_body_len','status_code','status_msg','info_code','info_msg','filename','tags','username','password','proxied','orig_fuids','orig_mime_types','resp_fuids','resp_mime_types','threat','sample'],
'notice.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','fuid','file_mime_type','file_desc','proto','note','msg','sub','src','dst','p','n','peer_descr','actions','suppress_for','dropped','remote_location.country_code','remote_location.region','remote_location.city','remote_location.latitude','remote_location.longitude','threat','sample'],
'signatures.log' : ['ts','src_addr','src_port','dst_addr','dst_port','note','sig_id','event_msg','sub_msg','sig_count','host_count','threat','sample'],
'smtp.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','trans_depth','helo','mailfrom','rcptto','date','from','to','reply_to','msg_id','in_reply_to','subject','x_originating_ip','first_received','second_received','last_reply','path','user_agent','fuids','is_webmail','threat','sample'],
'ssl.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','version','cipher','server_name','session_id','subject','issuer_subject','not_valid_before','not_valid_after','last_alert','client_subject','client_issuer_subject','cert_hash','validation_status','threat','sample'],
'tunnel.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','tunnel_type','action','threat','sample'],
'weird.log' : ['ts','uid','id.orig_h','id.orig_p','id.resp_h','id.resp_p','name','addl','notice','peer','threat','sample']
}
conndf = pd.DataFrame(columns=logs_to_process['conn.log'])
dnsdf = pd.DataFrame(columns=logs_to_process['dns.log'])
filesdf = pd.DataFrame(columns=logs_to_process['files.log'])
ftpdf = pd.DataFrame(columns=logs_to_process['ftp.log'])
httpdf = pd.DataFrame(columns=logs_to_process['http.log'])
noticedf = pd.DataFrame(columns=logs_to_process['notice.log'])
sigdf = pd.DataFrame(columns=logs_to_process['signatures.log'])
smtpdf = pd.DataFrame(columns=logs_to_process['smtp.log'])
ssldf = pd.DataFrame(columns=logs_to_process['ssl.log'])
tunneldf = pd.DataFrame(columns=logs_to_process['tunnel.log'])
weirddf = pd.DataFrame(columns=logs_to_process['weird.log'])
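The eleven near-identical constructors above could equivalently be built in one dict comprehension; a minimal sketch (the dfs name is an assumption and is not used elsewhere in this notebook):
In [ ]:
# One empty DataFrame per log type, keyed by log file name
dfs = {name: pd.DataFrame(columns=cols) for name, cols in logs_to_process.items()}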
In [59]:
def clean_timestamp(df):
    # Reference: https://github.com/carriegardner428/data_hacking/blob/master/contagio_traffic_analysis/contagio_traffic_analysis.ipynb
    # Drop the footer record that has no timestamp (Bro ends each log with a '#close' line)
    df = df[~df.ts.astype(str).str.contains("#close")].copy()
    # Convert epoch-second strings to datetimes; assigning the list aligns values positionally,
    # which matters because the filtered frame no longer has a contiguous index
    df['timestamp'] = [datetime.fromtimestamp(float(ts)) for ts in df['ts'].values]
    df.drop('ts', axis=1, inplace=True)
    df.set_index('timestamp', inplace=True)
    return df
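A quick sanity check of clean_timestamp on a made-up two-row frame (values are invented; real Bro logs are tab-separated):
In [ ]:
toy = pd.DataFrame({'ts': ['1325376000.0', '#close'], 'uid': ['CToy1', None]})
clean_timestamp(toy)   # one row survives, indexed by its datetime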
In [60]:
for dirName, subdirList, fileList in os.walk('./data/PCAPS_TRAFFIC_PATTERNS/'):
    for fname in fileList:
        # Directory layout is ./data/PCAPS_TRAFFIC_PATTERNS/<threat>/<sample>/,
        # so splitting on '/' yields five parts with threat and sample last
        tags = dirName.split('/')
        if len(tags) == 5 and fname in logs_to_process:
            try:
                # Bro logs carry an 8-line header; the last two column names
                # ('threat', 'sample') are ours, so don't pass them to read_csv
                tempdf = pd.read_csv(dirName + '/' + fname, sep='\t', skiprows=8,
                                     header=None, names=logs_to_process[fname][:-2])
                tempdf['threat'] = tags[3]
                tempdf['sample'] = tags[4]
                tempdf = clean_timestamp(tempdf)
                if fname == 'conn.log':
                    conndf = pd.concat([conndf, tempdf])
                elif fname == 'dns.log':
                    dnsdf = pd.concat([dnsdf, tempdf])
                elif fname == 'files.log':
                    filesdf = pd.concat([filesdf, tempdf])
                elif fname == 'ftp.log':
                    ftpdf = pd.concat([ftpdf, tempdf])
                elif fname == 'http.log':
                    httpdf = pd.concat([httpdf, tempdf])
                elif fname == 'notice.log':
                    noticedf = pd.concat([noticedf, tempdf])
                elif fname == 'signatures.log':
                    sigdf = pd.concat([sigdf, tempdf])
                elif fname == 'smtp.log':
                    smtpdf = pd.concat([smtpdf, tempdf])
                elif fname == 'ssl.log':
                    ssldf = pd.concat([ssldf, tempdf])
                elif fname == 'tunnel.log':
                    tunneldf = pd.concat([tunneldf, tempdf])
                elif fname == 'weird.log':
                    weirddf = pd.concat([weirddf, tempdf])
            except Exception as e:
                print("Error: {}, on {}/{}".format(str(e), dirName, fname))
In [69]:
conndf.shape
Out[69]:
In [74]:
conndf.sort_index(inplace=True)
conndf.index.year.unique()
Out[74]:
Were computers even around in 1969?
(Yes, LOL ;) ). A Bro timestamp of 0.0 is the Unix epoch (1970-01-01 UTC), which fromtimestamp renders as December 31, 1969 in US time zones, so those rows simply have zeroed-out timestamps. The sample itself spans roughly 2008-2013, so let's subset the frame to records in that interval.
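A one-liner shows where 1969 comes from (the exact rendering depends on your local time zone):
In [ ]:
# Epoch-second 0.0 is 1970-01-01 UTC; west of UTC it prints as late Dec 31, 1969
datetime.fromtimestamp(0)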
In [68]:
conndf = conndf['2008':'2013']
conndf.shape
Out[68]:
Since the index is a DatetimeIndex, you can slice with df['start_time/date':'end_time/date'] to subset the records that fall within a time range.
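A toy illustration of that slicing (made-up dates):
In [ ]:
idx = pd.to_datetime(['2007-06-01', '2009-03-15', '2012-11-30', '2014-02-01'])
toy = pd.DataFrame({'n': range(4)}, index=idx)
toy['2008':'2013']   # partial-string slicing is inclusive: keeps the 2009 and 2012 rows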
In [75]:
conndf.index.year.unique()
Out[75]:
dns.log
In [77]:
dnsdf.sort_index(inplace=True)
dnsdf.index.year.unique()
Out[77]:
In [87]:
dnsdf.shape
Out[87]:
In [88]:
dnsdf = dnsdf['2010':'2013']
dnsdf.shape
Out[88]:
files.log
In [78]:
filesdf.sort_index(inplace=True)
filesdf.index.year.unique()
Out[78]:
In [89]:
filesdf.shape
Out[89]:
In [90]:
filesdf = filesdf['2008':'2013']
filesdf.shape
Out[90]:
ftp.log
In [79]:
ftpdf.sort_index(inplace=True)
ftpdf.index.year.unique()
Out[79]:
In [91]:
ftpdf.shape
Out[91]:
In [92]:
ftpdf = ftpdf['2013']
ftpdf.shape
Out[92]:
http.log
In [80]:
httpdf.sort_index(inplace=True)
httpdf.index.year.unique()
Out[80]:
In [93]:
httpdf.shape
Out[93]:
In [94]:
httpdf = httpdf['2008':'2013']
httpdf.shape
Out[94]:
notice.log
In [81]:
noticedf.sort_index(inplace=True)
noticedf.index.year.unique()
Out[81]:
In [95]:
noticedf.shape
Out[95]:
In [96]:
noticedf = noticedf['2011':'2013']
noticedf.shape
Out[96]:
signatures.log
In [82]:
sigdf.sort_index(inplace=True)
sigdf.index.year.unique()
Out[82]:
In [97]:
sigdf.shape
Out[97]:
smtp.log
In [83]:
smtpdf.sort_index(inplace=True)
smtpdf.index.year.unique()
Out[83]:
In [99]:
smtpdf.shape
Out[99]:
ssl.log
In [84]:
ssldf.sort_index(inplace=True)
ssldf.index.year.unique()
Out[84]:
In [100]:
ssldf.shape
Out[100]:
In [101]:
ssldf = ssldf['2011':'2013']
ssldf.shape
Out[101]:
tunnel.log
In [85]:
tunneldf.sort_index(inplace=True)
tunneldf.index.year.unique()
Out[85]:
In [102]:
tunneldf.shape
Out[102]:
weird.log
In [86]:
weirddf.sort_index(inplace=True)
weirddf.index.year.unique()
Out[86]:
In [103]:
weirddf.shape
Out[103]:
In [105]:
weirddf = weirddf['2011':'2013']
weirddf.shape
Out[105]:
In [34]:
conndf.head()
Out[34]:
In [106]:
conndf.info()
In [131]:
# Get categorical, object-type variables
conndf.select_dtypes(include=['object']).describe()
Out[131]:
In [ ]:
conndf['local_orig'].unique()
In [135]:
conndf.tunnel_parents.value_counts()
Out[135]:
In [136]:
conndf.conn_state.value_counts()
Out[136]:
In [134]:
# Get numerical (non-object) variables
conndf.select_dtypes(exclude=['object']).describe()
Out[134]:
conndf TODOs:
- drop 'ts' column (all NaN; the timestamps now live in the index)
- drop 'local_orig' column, 1 unique value
In [126]:
conndf.drop(['ts', 'local_orig'], axis=1, inplace=True)
conndf.info()
In [15]:
dnsdf.head()
Out[15]:
In [137]:
dnsdf.info()
In [138]:
# Get categorical, object-type variables
dnsdf.select_dtypes(include=['object']).describe()
Out[138]:
In [139]:
dnsdf.rejected.value_counts()
Out[139]:
In [140]:
dnsdf.qclass_name.value_counts()
Out[140]:
In [141]:
dnsdf.TC.value_counts()
Out[141]:
In [142]:
# Get numerical (non-object) variables
dnsdf.select_dtypes(exclude=['object']).describe()
Out[142]:
In [143]:
dnsdf.Z.value_counts()
Out[143]:
dnsdf TODOs:
- drop 'ts' column (all NaN; the timestamps now live in the index)
- drop 'TC' column, 1 unique value
In [145]:
dnsdf.drop(['ts', 'TC'], axis=1, inplace=True)
dnsdf.head()
In [16]:
filesdf.head()
Out[16]:
In [147]:
filesdf.info()
In [148]:
# Get categorical, object-type variables
filesdf.select_dtypes(include=['object']).describe()
Out[148]:
In [149]:
# Get numerical (non-object) variables
filesdf.select_dtypes(exclude=['object']).describe()
Out[149]:
In [150]:
filesdf.overflow_bytes.value_counts()
Out[150]:
filesdf TODOs:
- drop 'ts' column (all NaN; the timestamps now live in the index)
- drop 'extracted' column, 1 unique value
- drop 'local_orig' column, 1 unique value
- drop 'parent_fuid' column, 1 unique value
- drop 'sha256' column, 1 unique value
- drop 'overflow_bytes' column, 1 unique value
In [151]:
filesdf.drop(['ts', 'extracted', 'local_orig', 'parent_fuid', 'sha256', 'overflow_bytes'],
             axis=1, inplace=True)
filesdf.head()
Out[151]:
In [18]:
ftpdf.head(5) # there are only 3 records
Out[18]:
In [152]:
ftpdf.info()
In [153]:
# Get categorical, object-type variables
ftpdf.select_dtypes(include=['object']).describe()
Out[153]:
In [154]:
# Get numerical (non-object) variables
ftpdf.select_dtypes(exclude=['object']).describe()
Out[154]:
ftpdf TODOs:
- drop 'ts' column (all NaN; the timestamps now live in the index)
- drop 'file_size' column, 1 unique value
- drop 'id.orig_h' column, 1 unique value
- drop 'id.resp_h' column, 1 unique value
- drop 'password' column, 1 unique value
- drop 'user' column, 1 unique value
In [155]:
ftpdf.drop(['ts', 'file_size', 'id.orig_h', 'id.resp_h', 'password', 'user'],
           axis=1, inplace=True)
ftpdf.head()
Out[155]:
In [174]:
httpdf.head(5)
Out[174]:
In [156]:
httpdf.info()
In [20]:
noticedf.head()
Out[20]:
In [21]:
sigdf.head() # only 1 record
Out[21]:
In [24]:
smtpdf.head(5)
Out[24]:
In [25]:
ssldf.head()
Out[25]:
In [26]:
tunneldf.head() # 2 records
Out[26]:
In [29]:
weirddf.head(5)
Out[29]:
In [ ]:
### Save DFs
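The save step itself isn't shown in the notebook; a minimal sketch using to_pickle (the file names are assumptions):
In [ ]:
# Pickle preserves dtypes and the DatetimeIndex, unlike a round-trip through CSV
conndf.to_pickle('conndf.pkl')
dnsdf.to_pickle('dnsdf.pkl')
# ...and likewise for the remaining log frames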
In [35]:
## Connections DF
In [37]:
conndf.sort_index(inplace=True)
conndf.info()
In [40]:
conndf.index
Out[40]:
In [39]:
conndf.plot(y='duration')
In [61]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
In [157]:
conndf.sort_index().index
Out[157]:
In [158]:
conndf.shape
Out[158]:
In [160]:
# Count records in hourly buckets, then average those hourly counts by day
hourly = conndf.groupby(pd.Grouper(freq='H')).count()
daily = hourly.groupby(pd.Grouper(freq='D')).mean()
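On a DatetimeIndex, pd.Grouper(freq='H') buckets the same way resample does; an equivalent sketch using size(), which counts rows per bucket rather than per column:
In [ ]:
hourly_counts = conndf.resample('H').size()
hourly_counts.head()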
In [176]:
hourly['2012':'2012'].uid.plot(kind='line', figsize=(15,5))
Out[176]:
In [172]:
# Plot hourly record counts per threat class (one line per threat)
ax = conndf.groupby(["threat", pd.Grouper(freq='H')]).size().unstack(0).plot()
In [177]:
### Countplots
In [181]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [179]:
conndf.select_dtypes(include=['object']).describe()
Out[179]:
In [184]:
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(ax=ax, x="conn_state", data=conndf)
plt.xticks(rotation=90)
Out[184]:
In [186]:
fig, ax = plt.subplots(figsize=(5, 5))
sns.countplot(ax=ax, x="proto", data=conndf)
plt.xticks(rotation=90)
Out[186]:
In [187]:
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(ax=ax, x="service", data=conndf)
plt.xticks(rotation=90)
Out[187]:
In [188]:
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(ax=ax, x="threat", data=conndf)
plt.xticks(rotation=90)
Out[188]:
In [ ]:
#### dns.log
In [ ]:
dnsdf.select_dtypes(include=['object']).describe()